## Start from an empty global environment: every object currently defined in
## the session is removed before the script runs.
rm(list=ls())
##set the data path
## All file names read and written below are relative to this directory.
setwd("E:/data/cooperation/patent/ndata/internet_patent")
##load the library----
library(foreign)
library(xlsxjars)
library(xlsx)
library(ggplot2)
library(gridExtra)
library(randomForest)
library(readstata13)
library(stringr)

##data processing----
# Load the panel and replace the raw patent count by log(count + 1).
da <- read.dta13("govern.dta")
da$lpat_uspt <- log(da$pat_uspt + 1)
da <- subset(da, select = -pat_uspt)
lab <- names(da)

## select the main variables based on theories of innovation and government
va <- c(
  "monetaryfreedom", "al_ethnic", "wdi_trade", "lgdppc", "vdem_corr",
  "kun_wiqreco_all", "fe_cultdiv", "urbaniz", "lpop_dnst", "laborfreedom",
  "wbgi_pse", "fincialfreedom", "wbgi_gee", "fh_fotpsc", "open",
  "yr_sch_ter", "wdi_internetuse", "taxburden", "wdi_telephone",
  "lpat_uspt", "wdi_internetserv", "ciri_polpris", "fh_ipolity2",
  "hf_efiscore", "fh_rol", "une_pee", "al_language"
)
# Select by name; a variable missing from the data stops the script with
# "undefined columns selected", exactly as the positional lookup did.
da1 <- da[, va]

##machine learning----
# Random forests are stochastic: fix the seed so that the importance ranking
# written to rank.csv is the same on every run.
set.seed(1)
result <- randomForest(
  lpat_uspt ~ .,
  data = da1,
  importance = TRUE,
  na.action = na.omit,
  ntree = 1000
)
## Table 1A: The Importance Degree of Variables from Random Forest
im <- importance(result)
write.csv(im, file = "rank.csv")

## get the original data
# Every candidate predictor, i.e. the variable list without the outcome.
# Logical exclusion is used because `va[-which(...)]` would return an empty
# vector if the outcome name were ever absent from `va`.
var <- va[va != "lpat_uspt"]
da2 <- da[, var]
write.dta(da2, file = "draft.dta")

# drop the less important variables
va <- c(
  "lpat_uspt", "wdi_internetuse", "yr_sch_ter", "lpop_dnst", "fh_ipolity2",
  "hf_efiscore", "lgdppc", "fe_cultdiv", "wbgi_pse", "wbgi_gee",
  "ciri_polpris", "wdi_telephone", "taxburden", "fincialfreedom", "open"
)
# add the variables of year and countrycode (plus regioncode)
var <- c(va, "year", "countrycode", "regioncode")
da <- da[, var]
## save the original data after machine learning
write.dta(da, file = "original.dta")

###check and analyze the rd_GDP-----
# Reload the panel, keep the variables needed for the R&D check and save them.
da <- read.dta13("govern.dta")
da$lpat_uspt <- log(da$pat_uspt + 1)
da <- subset(da, select = -pat_uspt)
va <- c("lpat_uspt", "wdi_internetuse", "lgdppc", "year", "countrycode", "rd_GDP")
da <- da[, va]
write.dta(da, file = "rd.dta")

# Per-country mean of rd_GDP over all available observations. Missing values
# are ignored; a country with no non-missing value yields NaN.
country <- names(table(da$countrycode))
rd_count <- vapply(
  country,
  function(code) {
    mean(da$rd_GDP[which(da$countrycode == code)], na.rm = TRUE)
  },
  numeric(1),
  USE.NAMES = FALSE
)
rd_count <- data.frame(rd_count)

# Descriptive statistics of the country means.
summary(rd_count)
sd(rd_count$rd_count, na.rm = TRUE)

# Histogram (density scale) of the country means with a kernel density line.
ggplot(rd_count, aes(rd_count)) +
  geom_histogram(aes(y = ..density..), bins = 80, fill = "white", color = "black") +
  stat_density(geom = "line", position = "identity", size = 1, color = "grey10") +
  theme_bw() +
  labs(x = "R&D Share in GDP(%)", y = "Density") +
  theme(title = element_text(size = 10)) +
  theme(
    panel.grid = element_blank(),
    panel.border = element_blank(),
    plot.title = element_text(hjust = 0.5, size = 9),
    axis.line = element_line()
  )

##fig 2----
# Scatter plot of the log patent count against internet use, points shaded by
# year (grey to black), with a semi-transparent red loess smoother on top.
da <- read.dta("original.dta")
ggplot(da, aes(x = wdi_internetuse, y = lpat_uspt)) +
  geom_point(aes(color = year)) +
  geom_smooth(method = "loess", color = alpha("red", 0.5), size = 1) +
  scale_colour_gradient(low = "grey", high = "black") +
  labs(x = "the Internet Penetration", y = "Log of the Patent Counts") +
  theme_bw() +
  theme(
    title = element_text(size = 10),
    panel.grid = element_blank(),
    panel.border = element_blank(),
    plot.title = element_text(hjust = 0.5),
    axis.line = element_line()
  )

##split the level of the democracy----
# For every country, summarise its democracy score (fh_ipolity2) over all of
# its observations, then split the countries at the cross-country mean of the
# per-country means.
country <- names(table(da$countrycode))

# One column per country: 1st quartile, median, mean, 3rd quartile.
# The score is addressed by name rather than by column position (it used to be
# hard-coded as column 5), so reordering the saved variables cannot silently
# switch the variable being summarised.
polity_stats <- vapply(
  country,
  function(code) {
    s <- summary(da[["fh_ipolity2"]][which(da$countrycode == code)])
    c(s[["1st Qu."]], s[["Median"]], s[["Mean"]], s[["3rd Qu."]])
  },
  numeric(4),
  USE.NAMES = FALSE
)
da_polity <- data.frame(
  country,
  FirstQu_polity = polity_stats[1, ],
  Median_polity = polity_stats[2, ],
  Mean_polity = polity_stats[3, ],
  ThirdQu_polity = polity_stats[4, ]
)

# Threshold: mean of the country means (6.686439 on the original data).
me <- mean(da_polity$Mean_polity)

# group = 1 for countries at or above the threshold, 0 below it. The comparison
# is vectorised, so no loop over countries is needed.
da_polity$group <- as.numeric(da_polity$Mean_polity >= me)
highcountry <- da_polity[da_polity$group == 1, 1]
lowcountry <- da_polity[da_polity$group == 0, 1]
##fig 3----
# Histogram (density scale) of the per-country mean democracy score with a
# kernel density line and a vertical line at the split threshold.
ggplot(da_polity, aes(Mean_polity)) +
  geom_histogram(aes(y = ..density..), bins = 50, fill = "white", color = "black") +
  stat_density(geom = "line", position = "identity", size = 1, color = "grey10") +
  # Draw the line at the computed threshold `me` rather than at a copied,
  # rounded literal (6.686439) that goes stale when the data change.
  geom_vline(xintercept = me) +
  theme_bw() +
  labs(x = "Level of Democracy for Different Countries", y = "Density") +
  theme(title = element_text(size = 10)) +
  theme(
    panel.grid = element_blank(),
    panel.border = element_blank(),
    plot.title = element_text(hjust = 0.5, size = 9),
    axis.line = element_line()
  )
##get the level of the democracy into variable (group)
# Save the two country lists, then flag every observation with its country's
# group: 1 for countries in `highcountry`, 0 for countries in `lowcountry`.
highcountry <- as.character(highcountry)
lowcountry <- as.character(lowcountry)
write.csv(highcountry, file = "country_high.csv")
write.csv(lowcountry, file = "country_low.csv")

# Rows start at 0, so observations matching neither list stay 0.
group <- numeric(nrow(da))
da <- data.frame(da, group)

# Vectorised membership tests replace the per-country index loops. Missing
# entries are dropped from the lists first so that a missing country code can
# never be matched, as with the `==` comparison used before.
da$group[da$countrycode %in% highcountry[!is.na(highcountry)]] <- 1
da$group[da$countrycode %in% lowcountry[!is.na(lowcountry)]] <- 0
write.dta(da, file = "final_demo.dta")
##check the number of countries in each democracy group
# Reload the saved panel, split it by `group` and print how many distinct
# country codes fall into the first and the second group.
da <- read.dta13("final_demo.dta")
by_group <- split(da, da$group)
da_0 <- by_group[[1]]
da_1 <- by_group[[2]]
length(table(da_0$countrycode))
length(table(da_1$countrycode))

##add the political system
# Attach polit_sys to the democracy panel; only (year, countrycode) pairs
# present in both files are kept by merge().
demo_panel <- read.dta("final_demo.dta")
polit_panel <- read.dta13("polit_sys.dta")
da <- merge(demo_panel, polit_panel, by = c("year", "countrycode"))

# Recode 1 -> 0 and 2 -> 1. Both row sets are located before anything is
# overwritten, so the freshly written values cannot be recoded a second time.
rows_coded_1 <- which(da$polit_sys == 1)
rows_coded_2 <- which(da$polit_sys == 2)
da$polit_sys[rows_coded_1] <- 0
da$polit_sys[rows_coded_2] <- 1

### amend the political system
# Manual country-level overrides applied to all years of the listed countries.
da$polit_sys[which(da$countrycode %in% c("TGO", "PAK"))] <- 0
da$polit_sys[which(da$countrycode %in% c("NPL", "HRV", "ISR", "KHM", "LVA"))] <- 1
write.dta(da, file = "final_sys.dta")

###add the Fixed broadband subscriptions and Scientific and technical journal articles
# Merge the contents of broadband.dta into the political-system panel on
# (year, countrycode) and overwrite final_sys.dta with the result.
da <- merge(
  read.dta13("final_sys.dta"),
  read.dta13("broadband.dta"),
  by = c("year", "countrycode")
)
write.dta(da, file = "final_sys.dta")


##check the number of political system-----
# Print the number of distinct country codes with polit_sys equal to 0 and to
# 1; rows with a missing polit_sys are excluded from both subsets.
da <- read.dta13("final_sys.dta")
presi <- subset(da, polit_sys == 0)
parli <- subset(da, polit_sys == 1)
length(table(presi$countrycode))
length(table(parli$countrycode))




